Loading the data

First, we will load our data from its source: PennDOT's public crash-data downloads for Philadelphia County, covering the years 2018–2023.

# load necessary libraries
import os
import requests
import zipfile
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import folium
import json
from IPython.display import display, HTML
from folium.plugins import TimeSliderChoropleth
from shapely.ops import nearest_points
# define download paths
# base_url + "<year>.zip" is PennDOT's per-year crash archive for Philadelphia County
base_url = "https://gis.penndot.gov/gishub/crashZip/County/Philadelphia/Philadelphia_"
download_dir = "philly_crash_data"
os.makedirs(download_dir, exist_ok=True)

# create an empty list to store one DataFrame per year
crash_data_list = []

# loop through 2018 to 2023 (range upper bound is exclusive):
# download each year's zip, extract the crash CSV, and load it
for year in range(2018, 2024):
    print(f"Processing data for year {year}...")
    zip_filename = f"{download_dir}/Philadelphia_{year}.zip"
    csv_filename = f"{download_dir}/CRASH_PHILADELPHIA_{year}.csv"

    # download the zip file if it doesn't exist yet
    if not os.path.exists(zip_filename):
        url = f"{base_url}{year}.zip"
        # timeout so a stalled connection fails instead of hanging the loop forever
        response = requests.get(url, timeout=60)
        if response.status_code == 200:
            with open(zip_filename, "wb") as file:
                file.write(response.content)
            print(f"Downloaded: {zip_filename}")
        else:
            print(f"Failed to download data for year {year}. URL: {url}")
            continue

    # extract the crash CSV from the archive (the zip also contains other tables)
    if not os.path.exists(csv_filename):
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            # match by suffix in case the archive stores the file under a folder prefix
            csv_files = [name for name in zip_ref.namelist() if name.endswith(f"CRASH_PHILADELPHIA_{year}.csv")]
            if csv_files:
                # NOTE(review): assumes the CSV sits at the archive root so the
                # extracted path equals csv_filename — verify against the archive layout
                zip_ref.extract(csv_files[0], download_dir)
                print(f"Extracted: {csv_filename}")
            else:
                print(f"Relevant CSV file not found in {zip_filename}")
                continue

    # load the CSV and tag every record with its source year
    if os.path.exists(csv_filename):
        df = pd.read_csv(csv_filename)
        df['year'] = year
        crash_data_list.append(df)

# combine all years into a single df
if crash_data_list:
    all_crashes = pd.concat(crash_data_list, ignore_index=True)
    print("Combined all years into a single DataFrame.")
else:
    # NOTE(review): downstream cells assume all_crashes exists; if nothing was
    # downloaded they will fail with a NameError after this message
    print("No data loaded.")

# remove the per-year zip archives and extracted CSVs — everything we need
# is now held in memory in all_crashes
for year in range(2018, 2024):
    temp_paths = (
        f"{download_dir}/Philadelphia_{year}.zip",
        f"{download_dir}/CRASH_PHILADELPHIA_{year}.csv",
    )
    for path in temp_paths:
        if os.path.exists(path):
            os.remove(path)
            print(f"Deleted: {path}")
# keep only crashes that involved at least one non-motorist, then drop any
# records missing coordinates (both steps return fresh DataFrames)
involves_non_motorist = all_crashes['NONMOTR_COUNT'] > 0
non_motorist_crashes = (
    all_crashes.loc[involves_non_motorist]
    .dropna(subset=['DEC_LAT', 'DEC_LONG'])
)

# build point geometries from the coordinate columns (x=longitude, y=latitude)
# and wrap everything in a GeoDataFrame in WGS84
crash_points = gpd.points_from_xy(
    non_motorist_crashes['DEC_LONG'], non_motorist_crashes['DEC_LAT']
)
non_motorist_gdf = gpd.GeoDataFrame(
    non_motorist_crashes, geometry=crash_points, crs="EPSG:4326"
)

Visualizing the crash data

Crash point data by year

Now that we have imported and prepared the crash data, we will quickly visualize a subset of the crashes (2021–2023 only, to keep the Quarto-rendered map to a manageable size) in the following interactive map.

Years can be toggled on and off to see distributions of crashes across time. Reducing the number of years visualized as well as zooming into the map reduces the relative size of the dots, making it easier to see the spatial distribution of crashes.

# create a base map centered on Philadelphia
m = folium.Map(location=[39.9526, -75.1652], zoom_start=12, tiles="cartodb positron")

# restrict to 2021 and later so the rendered map stays lightweight
filtered_gdf = non_motorist_gdf[non_motorist_gdf['year'] >= 2021]

# build one toggleable FeatureGroup per year; keyed by year for the loop below
# (the markers read DEC_LAT/DEC_LONG directly, so there is no need to copy each
# group or rebuild point geometry per year — the GeoDataFrame already has it)
year_layers = {}

for year, group in filtered_gdf.groupby('year'):
    year_layer = folium.FeatureGroup(name=str(year))

    # one circle marker per crash, with the crash record number in the popup
    for _, row in group.iterrows():
        folium.CircleMarker(
            location=[row['DEC_LAT'], row['DEC_LONG']],
            radius=5,
            popup=f"Crash ID: {row['CRN']}<br>Year: {row['year']}",
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6
        ).add_to(year_layer)

    year_layers[year] = year_layer

# add all layers to map
for year, layer in year_layers.items():
    layer.add_to(m)

# add layer control to toggle years with checkboxes
folium.LayerControl(collapsed=False).add_to(m)

# display map (last expression in the cell renders inline)
m
Make this Notebook Trusted to load map: File -> Trust Notebook
# persist the non-motorist crash points as GeoJSON so the next notebook can
# load them directly instead of re-downloading the raw PennDOT archives
non_motorist_gdf.to_file("non_motorist_gdf.geojson", driver="GeoJSON")

Click this link to proceed to the next section